# -*- coding: utf-8 -*-
"""
Created on Fri Sep 13 09:46:48 2019

"""

### Natural Language Processing: detect position of adjective and identify the next noun
# Path prefix of the folder that holds the input CSV and receives the output
# Excel file. It is only prepended to file names further down; it does not
# change the working directory of the process.
data_directory = "C:/Users/Administrator/Dropbox (Organisation and Innovation)/Marc - Gender Abstracts/Structured_abstracts/"

### Import packages 
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords


# open csv file; it must provide the 'abstract_text' and 'pmid' columns
x = pd.read_csv(data_directory + '270k_abstracts_clinical.csv')

# Missing abstracts are read as NaN, and astype(str) alone would turn them
# into the literal text "nan", which would then be tokenized like a real word.
# Replace them with an empty string first so that they yield no tokens.
x['abstract_text'] = x['abstract_text'].fillna('').astype(str)

# parallel lists: abstract[i] is the text that belongs to pmids[i]
abstract = x['abstract_text'].tolist()
pmids = x['pmid'].tolist()

# create a variable containing all punctuation
punctuation = list(string.punctuation)

# create a set with all English stopwords; tok_sent tests every token against
# it, and set membership is O(1) whereas the plain list returned by
# stopwords.words() would be scanned linearly on each test
stop_words = set(stopwords.words('english'))

# Import WordNet Lemmatizer
from nltk.stem.wordnet import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# define a function that tokenizes the text
def tok_sent(sent):
    """Tokenize a text into lower-cased, purely alphabetic content words.

    The text is lower-cased and split with NLTK's ``WordPunctTokenizer``.
    A token is kept only if it is purely alphabetic, is longer than one
    character, and is not contained in the module-level ``stop_words``.

    Parameters
    ----------
    sent : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    tokens = nltk.tokenize.WordPunctTokenizer().tokenize(sent.lower())

    # One pass instead of three. str.isalpha() already rejects every
    # punctuation token (and anything containing digits), so a separate
    # punctuation test is not needed. The cheap checks run before the
    # stopword lookup.
    return [
        token
        for token in tokens
        if token.isalpha() and len(token) > 1 and token not in stop_words
    ]


# run the tokenizer function over every abstract text; the result keeps the
# order of `abstract`, so token_abstracts[i] holds the tokens of abstract[i]
token_abstracts = [tok_sent(abstract_text) for abstract_text in abstract]
    
# Adjectives to look for (illustration with a few positive words), mapped to
# the spelling that is stored in the output: the British "favourable" is
# recorded under the American spelling "favorable"
target_adjectives = {
    "novel": "novel",
    "promising": "promising",
    "robust": "robust",
    "unique": "unique",
    "favorable": "favorable",
    "favourable": "favorable",
}

# Three parallel lists with one entry per adjective occurrence:
# the adjective that was found
adjective_list = []

# the pmid of the abstract it was found in
pmid_list = []

# all tokens that follow the adjective in that abstract
next_tokens_list = []

# pmids and token_abstracts are in the same order, so pair them up directly
for pmid, line in zip(pmids, token_abstracts):

    for i_token, token in enumerate(line):

        adjective = target_adjectives.get(token)

        # skip every token that is not one of the target adjectives
        if adjective is None:
            continue

        adjective_list.append(adjective)
        pmid_list.append(pmid)
        next_tokens_list.append(line[i_token + 1:])
    

# initialize a list with the next nouns
next_nouns = []

# part-of-speech tags that count as a noun
noun_tags = {'NN', 'NNP', 'NNS', 'NNPS'}

# Build the part-of-speech tagger once and reuse it for every token list,
# instead of calling nltk.pos_tag() inside the loop.
# NOTE(review): this assumes nltk.pos_tag() creates a fresh default English
# PerceptronTagger (loading its model) on each call, so that tagging with one
# shared PerceptronTagger gives the same tags -- confirm for the installed
# NLTK version.
pos_tagger = nltk.tag.PerceptronTagger()

# Now go through the list of tokens and only store the first noun in each list
for line in next_tokens_list:

    # determine the role of every word that follows the adjective
    pos_text = pos_tagger.tag(line)

    # take the first word tagged as a noun, if there is one
    first_noun = next((word for word, pos in pos_text if pos in noun_tags), None)

    # lemmatize only that noun, since the later nouns are never used;
    # "NA" marks an adjective that has no noun after it
    if first_noun is None:
        next_noun = "NA"
    else:
        next_noun = wordnet_lemmatizer.lemmatize(first_noun, "n")

    # append list of next nouns by the noun
    next_nouns.append(next_noun)
    
     
# Collect pmids, adjectives, and nouns as the columns of one table
# (one row per adjective occurrence) and save it as an Excel file
output_path = data_directory + 'pmid_adjectives_nouns_270k_clinical_abstracts.xlsx'
pmid_nouns = pd.DataFrame({
    'pmid': pmid_list,
    'adjective': adjective_list,
    'next_noun': next_nouns,
})

pmid_nouns.to_excel(output_path)


    


